Load dataset

data <- read.csv("/Users/bebexmimi/Downloads/archive (1)/AB_NYC_2019.csv")
head(data)
##     id                                             name host_id   host_name
## 1 2539               Clean & quiet apt home by the park    2787        John
## 2 2595                            Skylit Midtown Castle    2845    Jennifer
## 3 3647              THE VILLAGE OF HARLEM....NEW YORK !    4632   Elisabeth
## 4 3831                  Cozy Entire Floor of Brownstone    4869 LisaRoxanne
## 5 5022 Entire Apt: Spacious Studio/Loft by central park    7192       Laura
## 6 5099        Large Cozy 1 BR Apartment In Midtown East    7322       Chris
##   neighbourhood_group neighbourhood latitude longitude       room_type price
## 1            Brooklyn    Kensington 40.64749 -73.97237    Private room   149
## 2           Manhattan       Midtown 40.75362 -73.98377 Entire home/apt   225
## 3           Manhattan        Harlem 40.80902 -73.94190    Private room   150
## 4            Brooklyn  Clinton Hill 40.68514 -73.95976 Entire home/apt    89
## 5           Manhattan   East Harlem 40.79851 -73.94399 Entire home/apt    80
## 6           Manhattan   Murray Hill 40.74767 -73.97500 Entire home/apt   200
##   minimum_nights number_of_reviews last_review reviews_per_month
## 1              1                 9  2018-10-19              0.21
## 2              1                45  2019-05-21              0.38
## 3              3                 0                            NA
## 4              1               270  2019-07-05              4.64
## 5             10                 9  2018-11-19              0.10
## 6              3                74  2019-06-22              0.59
##   calculated_host_listings_count availability_365
## 1                              6              365
## 2                              2              355
## 3                              1              365
## 4                              1              194
## 5                              1                0
## 6                              1              129
summary(data)
##        id               name              host_id           host_name        
##  Min.   :    2539   Length:48895       Min.   :     2438   Length:48895      
##  1st Qu.: 9471945   Class :character   1st Qu.:  7822033   Class :character  
##  Median :19677284   Mode  :character   Median : 30793816   Mode  :character  
##  Mean   :19017143                      Mean   : 67620011                     
##  3rd Qu.:29152178                      3rd Qu.:107434423                     
##  Max.   :36487245                      Max.   :274321313                     
##                                                                              
##  neighbourhood_group neighbourhood         latitude       longitude     
##  Length:48895        Length:48895       Min.   :40.50   Min.   :-74.24  
##  Class :character    Class :character   1st Qu.:40.69   1st Qu.:-73.98  
##  Mode  :character    Mode  :character   Median :40.72   Median :-73.96  
##                                         Mean   :40.73   Mean   :-73.95  
##                                         3rd Qu.:40.76   3rd Qu.:-73.94  
##                                         Max.   :40.91   Max.   :-73.71  
##                                                                         
##   room_type             price         minimum_nights    number_of_reviews
##  Length:48895       Min.   :    0.0   Min.   :   1.00   Min.   :  0.00   
##  Class :character   1st Qu.:   69.0   1st Qu.:   1.00   1st Qu.:  1.00   
##  Mode  :character   Median :  106.0   Median :   3.00   Median :  5.00   
##                     Mean   :  152.7   Mean   :   7.03   Mean   : 23.27   
##                     3rd Qu.:  175.0   3rd Qu.:   5.00   3rd Qu.: 24.00   
##                     Max.   :10000.0   Max.   :1250.00   Max.   :629.00   
##                                                                          
##  last_review        reviews_per_month calculated_host_listings_count
##  Length:48895       Min.   : 0.010    Min.   :  1.000               
##  Class :character   1st Qu.: 0.190    1st Qu.:  1.000               
##  Mode  :character   Median : 0.720    Median :  1.000               
##                     Mean   : 1.373    Mean   :  7.144               
##                     3rd Qu.: 2.020    3rd Qu.:  2.000               
##                     Max.   :58.500    Max.   :327.000               
##                     NA's   :10052                                   
##  availability_365
##  Min.   :  0.0   
##  1st Qu.:  0.0   
##  Median : 45.0   
##  Mean   :112.8   
##  3rd Qu.:227.0   
##  Max.   :365.0   
## 

Data preparation

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data <- data %>% select(-id, -host_name, -last_review,-name)
colSums(is.na(data))
##                        host_id            neighbourhood_group 
##                              0                              0 
##                  neighbourhood                       latitude 
##                              0                              0 
##                      longitude                      room_type 
##                              0                              0 
##                          price                 minimum_nights 
##                              0                              0 
##              number_of_reviews              reviews_per_month 
##                              0                          10052 
## calculated_host_listings_count               availability_365 
##                              0                              0
data$reviews_per_month[is.na(data$reviews_per_month)] <- mean(data$reviews_per_month, na.rm = TRUE)
colSums(is.na(data))
##                        host_id            neighbourhood_group 
##                              0                              0 
##                  neighbourhood                       latitude 
##                              0                              0 
##                      longitude                      room_type 
##                              0                              0 
##                          price                 minimum_nights 
##                              0                              0 
##              number_of_reviews              reviews_per_month 
##                              0                              0 
## calculated_host_listings_count               availability_365 
##                              0                              0
library(corrplot)
## corrplot 0.95 loaded
num_data <- data %>% select_if(is.numeric)

cor_matrix <- cor(num_data)
corrplot(cor_matrix, method = "color", tl.col = "black")

library(ggplot2)
ggplot(data, aes(x=price)) + 
    geom_histogram(bins = 30, fill = "blue") + 
    ggtitle("The raw distribution of prices") + labs(x="Price")

data$log_price <- log(data$price+1)
data <- data %>%
  select(-price)
ggplot(data, aes(x=log_price)) + 
    geom_histogram(bins = 30, fill = "blue") +
    ggtitle("The raw distribution of prices") + labs(x="Price")

library(dplyr)
library(leaflet)

# Select relevant columns including pre-computed log_price
to_leaflet_pre <- data %>%
  select(longitude, latitude, neighbourhood_group, neighbourhood, room_type, log_price)

# Cap log_price for visualization purposes
to_leaflet_pre$log_price[to_leaflet_pre$log_price <= 3] <- 3
to_leaflet_pre$log_price[to_leaflet_pre$log_price >= 7] <- 7

# Center of map
center_lon <- median(to_leaflet_pre$longitude, na.rm = TRUE)
center_lat <- median(to_leaflet_pre$latitude, na.rm = TRUE)

# Color palette for log_price
pal <- colorNumeric(
  palette = colorRampPalette(c('red', 'yellow', 'green'))(length(to_leaflet_pre$log_price)),
  domain = to_leaflet_pre$log_price
)

# Create interactive map
leaflet(data = to_leaflet_pre) %>%
  addTiles() %>%
  addCircleMarkers(
    lng = ~longitude,
    lat = ~latitude,
    popup = ~paste0(
      "<strong>Region:</strong> ", neighbourhood_group, ", ", neighbourhood, "<br>",
      "<strong>Room type:</strong> ", room_type, "<br>",
      "<strong>Log price:</strong> ", round(log_price, 2), "<br>"
    ),
    color = ~pal(log_price),
    radius = 5,
    fillOpacity = 0.7,
    stroke = FALSE,
    clusterOptions = markerClusterOptions()
  ) %>%
  setView(lng = center_lon, lat = center_lat, zoom = 10) %>%
  addLegend(
    position = "topleft",
    pal = pal,
    values = ~log_price,
    title = "Log Prices of Airbnb Listings",
    opacity = 1
  )  

Model Discussion

GAMs

library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
library(caret)
## Loading required package: lattice
set.seed(123)
train_index <- createDataPartition(data$log_price, p = 0.8, list = FALSE)

train_data <- data[train_index, ]
test_data <- data[-train_index, ]
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
## 
## Attaching package: 'mgcv'
## The following objects are masked from 'package:gam':
## 
##     gam, gam.control, gam.fit, s
library(rsample)  
library(Metrics) 
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(purrr)    
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:caret':
## 
##     lift
## The following objects are masked from 'package:foreach':
## 
##     accumulate, when
set.seed(123) 
folds <- vfold_cv(data, v = 5)  

aic_values <- c()
bic_values <- c()
adj_r2_values <- c()
rmse_values <- c()

cv_results <- map_dfr(folds$splits, function(split) {
  train_split <- analysis(split)
  val_split <- assessment(split)
  
  gam_model <- gam(
    log_price ~ s(latitude) + s(longitude) + s(minimum_nights) +
      s(number_of_reviews) + s(reviews_per_month) +
      s(calculated_host_listings_count) + s(availability_365) +
      neighbourhood_group + room_type,
    data = train_split
  )
  
  preds <- predict(gam_model, newdata = val_split)
  
  rmse_error <- rmse(val_split$log_price, preds)
  
  gam_summary <- summary(gam_model)
  aic_value <- AIC(gam_model)
  bic_value <- BIC(gam_model)
  adj_r2_value <- gam_summary$r.sq 
  
  aic_values <<- c(aic_values, aic_value)
  bic_values <<- c(bic_values, bic_value)
  adj_r2_values <<- c(adj_r2_values, adj_r2_value)
  rmse_values <<- c(rmse_values, rmse_error)
  
  data.frame(
    AIC = aic_value,
    BIC = bic_value,
    Adj_R2 = adj_r2_value,
    RMSE = rmse_error
  )
})

cat("Average AIC:", mean(aic_values), "\n")
## Average AIC: 51093.13
cat("Average BIC:", mean(bic_values), "\n")
## Average BIC: 51637.82
cat("Average Adjusted R²:", mean(adj_r2_values), "\n")
## Average Adjusted R²: 0.55362
cat("Average RMSE:", mean(rmse_values), "\n")
## Average RMSE: 0.4650892

Boosting

library(gbm)
## Loaded gbm 2.2.2
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
set.seed(123)
control <- trainControl(method = "cv", number = 5)
gbm_model <- train(
  log_price ~ latitude + longitude + minimum_nights + number_of_reviews +
    reviews_per_month + calculated_host_listings_count + availability_365 +
    neighbourhood_group + room_type,
  data = train_data,
  method = "gbm",
  trControl = control,
  tuneLength = 5,
  verbose = FALSE
)

print(gbm_model)
## Stochastic Gradient Boosting 
## 
## 39117 samples
##     9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 31295, 31295, 31294, 31292, 31292 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE       Rsquared   MAE      
##   1                   50      0.5057970  0.4913537  0.3636289
##   1                  100      0.4833654  0.5283927  0.3449918
##   1                  150      0.4738119  0.5416858  0.3375333
##   1                  200      0.4690479  0.5485371  0.3341971
##   1                  250      0.4666565  0.5517995  0.3325737
##   2                   50      0.4801048  0.5351175  0.3425768
##   2                  100      0.4643370  0.5582257  0.3307043
##   2                  150      0.4592904  0.5656327  0.3272888
##   2                  200      0.4569101  0.5695199  0.3258748
##   2                  250      0.4554581  0.5720984  0.3247969
##   3                   50      0.4707650  0.5497830  0.3354047
##   3                  100      0.4579274  0.5684302  0.3262670
##   3                  150      0.4544735  0.5740498  0.3238697
##   3                  200      0.4523963  0.5777319  0.3224183
##   3                  250      0.4512094  0.5798783  0.3214524
##   4                   50      0.4657537  0.5576253  0.3317610
##   4                  100      0.4549156  0.5737034  0.3242083
##   4                  150      0.4517824  0.5790069  0.3218254
##   4                  200      0.4501156  0.5819272  0.3205804
##   4                  250      0.4489459  0.5840324  0.3197992
##   5                   50      0.4620750  0.5632416  0.3291552
##   5                  100      0.4524728  0.5779501  0.3224659
##   5                  150      0.4497843  0.5826146  0.3202886
##   5                  200      0.4480907  0.5856444  0.3189426
##   5                  250      0.4467777  0.5880243  0.3179834
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were n.trees = 250, interaction.depth =
##  5, shrinkage = 0.1 and n.minobsinnode = 10.
summary(gbm_model)

##                                                               var     rel.inf
## room_typePrivate room                       room_typePrivate room 48.97834882
## longitude                                               longitude 13.03437708
## room_typeShared room                         room_typeShared room 10.49956878
## latitude                                                 latitude  8.73159804
## availability_365                                 availability_365  5.84517929
## minimum_nights                                     minimum_nights  3.94520960
## neighbourhood_groupManhattan         neighbourhood_groupManhattan  3.35662901
## number_of_reviews                               number_of_reviews  2.31646769
## calculated_host_listings_count     calculated_host_listings_count  2.27427586
## reviews_per_month                               reviews_per_month  0.87305383
## neighbourhood_groupBrooklyn           neighbourhood_groupBrooklyn  0.11792785
## neighbourhood_groupQueens               neighbourhood_groupQueens  0.02736416
## neighbourhood_groupStaten Island neighbourhood_groupStaten Island  0.00000000
par(mfrow = c(1, 2))
plot(gbm_model$finalModel, i = "longitude", main = "Partial Dependence: longitude")

par(mfrow = c(1, 2))
plot(gbm_model$finalModel, i = "latitude", main = "Partial Dependence: latitude")

train_preds <- predict(gbm_model, newdata = train_data)
test_preds <- predict(gbm_model, newdata = test_data)
# RMSE
rmse_train <- sqrt(mean((train_data$log_price - train_preds)^2))
rmse_test <- sqrt(mean((test_data$log_price - test_preds)^2))
# R²
r2_train <- 1 - sum((train_data$log_price - train_preds)^2) / sum((train_data$log_price - mean(train_data$log_price))^2)
r2_test <- 1 - sum((test_data$log_price - test_preds)^2) / sum((test_data$log_price - mean(test_data$log_price))^2)

# Number of predictors (excluding intercept)
p <- length(gbm_model$finalModel$var.names)

# Sample sizes
n_train <- nrow(train_data)
n_test <- nrow(test_data)

# Adjusted R²
adj_r2_train <- 1 - ((1 - r2_train) * (n_train - 1)) / (n_train - p - 1)
adj_r2_test <- 1 - ((1 - r2_test) * (n_test - 1)) / (n_test - p - 1)
cat("Train RMSE:", rmse_train, "\n")
## Train RMSE: 0.4356909
cat("Test RMSE:", rmse_test, "\n")
## Test RMSE: 0.4445946
cat("Train Adjusted R²:", adj_r2_train, "\n")
## Train Adjusted R²: 0.6080001
cat("Test Adjusted R²:", adj_r2_test, "\n")
## Test Adjusted R²: 0.5874943